Loading packages

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(janitor)
library(ClassifyR)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: MultiAssayExperiment
## Loading required package: BiocParallel

Loading data

# Read the Test-match batting table and convert its column names to
# smallCamel case (e.g. `Career Span` becomes careerSpan).
rawBattingData <- janitor::clean_names(
  read_csv("cricinfo-statsguru-data/Test Matches - Batting.csv"),
  case = "small_camel"
)
## Parsed with column specification:
## cols(
##   Player = col_character(),
##   `Career Span` = col_character(),
##   `Career Start` = col_integer(),
##   `Career End` = col_integer(),
##   `Matches Played` = col_integer(),
##   `Innings Batted` = col_character(),
##   `Not Outs` = col_character(),
##   `Runs Scored` = col_character(),
##   `Highest Innings Score` = col_character(),
##   `Highest Innings Score Num` = col_character(),
##   `Batting Avg` = col_character(),
##   `Hundreds Scored` = col_character(),
##   `Scores Of Fifty Or More` = col_character(),
##   `Ducks Scored` = col_character(),
##   Country = col_character(),
##   `Player Count` = col_integer(),
##   `10000+ Runs Scored` = col_integer(),
##   `50+ Batting Avg` = col_integer()
## )
# Print a compact, column-by-column preview of the raw batting table.
glimpse(rawBattingData)
## Observations: 2,918
## Variables: 18
## $ player                 <chr> "AN Cook (2006-2018)", "GA Gooch (1975-...
## $ careerSpan             <chr> "2006-2018", "1975-1995", "1990-2003", ...
## $ careerStart            <int> 2006, 1975, 1990, 1978, 2005, 1964, 198...
## $ careerEnd              <int> 2018, 1995, 2003, 1992, 2014, 1982, 200...
## $ matchesPlayed          <int> 156, 118, 133, 117, 104, 108, 115, 118,...
## $ inningsBatted          <chr> "282", "215", "235", "204", "181", "193...
## $ notOuts                <chr> "16", "6", "21", "18", "8", "23", "7", ...
## $ runsScored             <chr> "12145", "8900", "8463", "8231", "8181"...
## $ highestInningsScore    <chr> "294", "333", "190", "215", "227", "246...
## $ highestInningsScoreNum <chr> "294", "333", "190", "215", "227", "246...
## $ battingAvg             <chr> "45.65", "42.58", "39.54", "44.25", "47...
## $ hundredsScored         <chr> "32", "20", "15", "18", "23", "22", "16...
## $ scoresOfFiftyOrMore    <chr> "56", "46", "45", "39", "35", "42", "46...
## $ ducksScored            <chr> "8", "13", "14", "7", "10", "10", "20",...
## $ country                <chr> "England", "England", "England", "Engla...
## $ playerCount            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ x10000RunsScored       <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x50BattingAvg          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, ...
# Read the Test-match bowling table and convert its column names to
# smallCamel case (e.g. `Wickets Taken` becomes wicketsTaken).
rawBowlingData <- janitor::clean_names(
  read_csv("cricinfo-statsguru-data/Test Matches - Bowling.csv"),
  case = "small_camel"
)
## Parsed with column specification:
## cols(
##   Player = col_character(),
##   `Innings Bowled In` = col_character(),
##   `Balls Bowled` = col_character(),
##   `Runs Conceded` = col_character(),
##   `Wickets Taken` = col_character(),
##   `Best Bowling In An Innings` = col_character(),
##   `Best Bowling In A Match` = col_character(),
##   `Bowling Avg` = col_character(),
##   `Economy Rate` = col_character(),
##   `Bowling Strike Rate` = col_character(),
##   `Five Wickets In An Innings` = col_character(),
##   `Ten Wickets In A Match` = col_character(),
##   `300+ Wickets Taken` = col_integer(),
##   `<25.00 Bowling Avg` = col_integer()
## )
# Print a compact, column-by-column preview of the raw bowling table.
glimpse(rawBowlingData)
## Observations: 2,918
## Variables: 14
## $ player                 <chr> "JM Anderson (2003-2018)", "SCJ Broad (...
## $ inningsBowledIn        <chr> "257", "215", "168", "165", "127", "151...
## $ ballsBowled            <chr> "30398", "24346", "21815", "17357", "15...
## $ runsConceded           <chr> "14705", "12050", "10878", "8190", "662...
## $ wicketsTaken           <chr> "540", "417", "383", "325", "307", "297...
## $ bestBowlingInAnInnings <chr> "7/42", "8/15", "8/34", "8/43", "8/31",...
## $ bestBowlingInAMatch    <chr> "11/71", "11/121", "13/106", "9/92", "1...
## $ bowlingAvg             <chr> "27.23", "28.89", "28.40", "25.20", "21...
## $ economyRate            <chr> "2.90", "2.96", "2.99", "2.83", "2.61",...
## $ bowlingStrikeRate      <chr> "56.2", "58.3", "56.9", "53.4", "49.4",...
## $ fiveWicketsInAnInnings <chr> "25", "16", "27", "16", "17", "17", "17...
## $ tenWicketsInAMatch     <chr> "3", "2", "4", "0", "3", "6", "3", "1",...
## $ x300WicketsTaken       <int> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x25_00BowlingAvg       <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...
# Read the all-rounder table and convert its column names to smallCamel
# case (e.g. `All-Round Ind` becomes allRoundInd).
rawAllRounderData <- janitor::clean_names(
  read_csv("cricinfo-statsguru-data/Test Matches - All Round.csv"),
  case = "small_camel"
)
## Parsed with column specification:
## cols(
##   Player = col_character(),
##   `Batting-Bowling Avg Diff` = col_double(),
##   `All-Round Ind` = col_integer()
## )
# Print a compact, column-by-column preview of the raw all-rounder table.
glimpse(rawAllRounderData)
## Observations: 25
## Variables: 3
## $ player                <chr> "AW Greig (1972-1977)", "IT Botham (1977...
## $ battingBowlingAvgDiff <dbl> 8.23, 5.14, 3.22, 0.52, -1.45, -8.28, 13...
## $ allRoundInd           <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
# Innings cutoff: the cleaning steps keep only players with strictly more
# than this many innings (batted in / bowled in, respectively).
inningsThres <- 40

Cleaning data

Batting data

# Tidy the batting table:
#   1. remove the " (...)" suffix from `player`
#      (e.g. "AN Cook (2006-2018)" becomes "AN Cook");
#   2. coerce the character-typed count columns to integer and the
#      runs/average columns to double (entries that are not numbers become
#      NA, which is what triggers the "NAs introduced by coercion" warnings);
#   3. drop columns that are not used later;
#   4. keep players with more than `inningsThres` innings batted
#      (rows whose innings count is NA are dropped by the filter as well).
# mutate_at() is used because the attached dplyr (0.7.6) predates across().
# NOTE(review): highestInningsScoreNum is left as character here, so it is
# excluded from later is.numeric selections -- confirm this is intended.
cleanedBattingData <- rawBattingData %>%
  dplyr::mutate(player = str_replace(player, " \\([^>]+\\)", "")) %>%
  dplyr::mutate_at(
    dplyr::vars(
      inningsBatted, notOuts, hundredsScored,
      scoresOfFiftyOrMore, ducksScored
    ),
    as.integer
  ) %>%
  dplyr::mutate_at(dplyr::vars(runsScored, battingAvg), as.numeric) %>%
  dplyr::select(-careerSpan, -highestInningsScore, -playerCount) %>%
  dplyr::filter(inningsBatted > inningsThres)
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
# Preview the cleaned batting table (column types after coercion and filtering).
glimpse(cleanedBattingData)
## Observations: 620
## Variables: 15
## $ player                 <chr> "AN Cook", "GA Gooch", "AJ Stewart", "D...
## $ careerStart            <int> 2006, 1975, 1990, 1978, 2005, 1964, 198...
## $ careerEnd              <int> 2018, 1995, 2003, 1992, 2014, 1982, 200...
## $ matchesPlayed          <int> 156, 118, 133, 117, 104, 108, 115, 118,...
## $ inningsBatted          <int> 282, 215, 235, 204, 181, 193, 212, 205,...
## $ notOuts                <int> 16, 6, 21, 18, 8, 23, 7, 24, 15, 16, 6,...
## $ runsScored             <dbl> 12145, 8900, 8463, 8231, 8181, 8114, 77...
## $ highestInningsScoreNum <chr> "294", "333", "190", "215", "227", "246...
## $ battingAvg             <dbl> 45.65, 42.58, 39.54, 44.25, 47.28, 47.7...
## $ hundredsScored         <int> 32, 20, 15, 18, 23, 22, 16, 22, 22, 22,...
## $ scoresOfFiftyOrMore    <int> 56, 46, 45, 39, 35, 42, 46, 46, 38, 24,...
## $ ducksScored            <int> 8, 13, 14, 7, 10, 10, 20, 14, 9, 4, 15,...
## $ country                <chr> "England", "England", "England", "Engla...
## $ x10000RunsScored       <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x50BattingAvg          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, ...

Bowling data

# Tidy the bowling table:
#   1. remove the " (...)" suffix from `player`;
#   2. coerce the character-typed count columns to integer and the
#      average/rate columns to double (non-numeric entries become NA);
#   3. label players with fewer than 50 wickets "Not bowler", others "bowler";
#   4. split the "wickets/runs" best-bowling strings into two columns each
#      (the pieces stay character; rows without a "/" get NA pieces);
#   5. drop every row containing any NA, then keep players who bowled in
#      more than `inningsThres` innings.
# mutate_at() is used because the attached dplyr (0.7.6) predates across().
cleanedBowlingData <- rawBowlingData %>%
  dplyr::mutate(player = str_replace(player, " \\([^>]+\\)", "")) %>%
  dplyr::mutate_at(
    dplyr::vars(
      inningsBowledIn, ballsBowled, runsConceded, wicketsTaken,
      fiveWicketsInAnInnings, tenWicketsInAMatch
    ),
    as.integer
  ) %>%
  dplyr::mutate_at(
    dplyr::vars(bowlingAvg, economyRate, bowlingStrikeRate),
    as.numeric
  ) %>%
  dplyr::mutate(
    isBowler = ifelse(wicketsTaken < 50, "Not bowler", "bowler")
  ) %>%
  tidyr::separate(
    bestBowlingInAnInnings,
    into = c("mostWicketsInnings", "mostWicketsInningsRuns"),
    sep = "/"
  ) %>%
  tidyr::separate(
    bestBowlingInAMatch,
    into = c("mostWicketsMatch", "mostWicketsMatchRuns"),
    sep = "/"
  ) %>%
  na.omit() %>%
  dplyr::filter(inningsBowledIn > inningsThres)
## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion

## Warning in eval(substitute(expr), envir, enclos): NAs introduced by
## coercion
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1178 rows
## [398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412,
## 413, 414, 415, 416, 417, ...].

## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1178 rows
## [398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412,
## 413, 414, 415, 416, 417, ...].
# Preview the cleaned bowling table (column types after coercion and filtering).
glimpse(cleanedBowlingData)
## Observations: 325
## Variables: 17
## $ player                 <chr> "JM Anderson", "SCJ Broad", "IT Botham"...
## $ inningsBowledIn        <int> 257, 215, 168, 165, 127, 151, 109, 129,...
## $ ballsBowled            <int> 30398, 24346, 21815, 17357, 15178, 2186...
## $ runsConceded           <int> 14705, 12050, 10878, 8190, 6625, 7674, ...
## $ wicketsTaken           <int> 540, 417, 383, 325, 307, 297, 255, 252,...
## $ mostWicketsInnings     <chr> "7", "8", "8", "8", "8", "8", "6", "7",...
## $ mostWicketsInningsRuns <chr> "42", "15", "34", "43", "31", "51", "65...
## $ mostWicketsMatch       <chr> "11", "11", "13", "9", "12", "13", "10"...
## $ mostWicketsMatchRuns   <chr> "71", "121", "106", "92", "119", "71", ...
## $ bowlingAvg             <dbl> 27.23, 28.89, 28.40, 25.20, 21.57, 25.8...
## $ economyRate            <dbl> 2.90, 2.96, 2.99, 2.83, 2.61, 2.10, 2.9...
## $ bowlingStrikeRate      <dbl> 56.2, 58.3, 56.9, 53.4, 49.4, 73.6, 60....
## $ fiveWicketsInAnInnings <int> 25, 16, 27, 16, 17, 17, 17, 9, 7, 15, 1...
## $ tenWicketsInAMatch     <int> 3, 2, 4, 0, 3, 6, 3, 1, 1, 5, 1, 0, 1, ...
## $ x300WicketsTaken       <int> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ x25_00BowlingAvg       <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, ...
## $ isBowler               <chr> "bowler", "bowler", "bowler", "bowler",...

All-rounder data

# Remove the " (...)" suffix from player names so they can be compared
# with the player names in the cleaned batting and bowling tables.
cleanedAllRounderData <- dplyr::mutate(
  rawAllRounderData,
  player = str_replace(player, " \\([^>]+\\)", "")
)

Numeric batting data

# Numeric batting features used for clustering / PCA / classification.
# Keeps the numeric columns, re-attaches `player`, replaces runsScored by
# its base-10 logarithm, and drops the career start/end years.
# Rows with an infinite logRuns (log10 of zero runs) are removed first,
# then any row that still contains an NA.
numBattingData <- cleanedBattingData %>%
  dplyr::select_if(is.numeric) %>%
  dplyr::mutate(
    player = cleanedBattingData$player,
    logRuns = log10(runsScored)
  ) %>%
  dplyr::select(-runsScored, -careerStart, -careerEnd) %>%
  dplyr::filter(!is.infinite(logRuns)) %>%
  na.omit()

# Rows (players) and columns (features plus `player`) of the numeric batting data.
dim(numBattingData)
## [1] 620  11
# Feature matrix: every column except `player`, centred and scaled to
# unit variance column-wise by scale().
numBattingMatrix <- scale(
  as.matrix(
    as.data.frame(dplyr::select(numBattingData, -player))
  )
)

Monocle

library(monocle)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following objects are masked from 'package:ClassifyR':
## 
##     featureNames, sampleNames
## Loading required package: VGAM
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## The following object is masked from 'package:tidyr':
## 
##     fill
## Loading required package: DDRTree
## Loading required package: irlba
# Inputs for a monocle CellDataSet built from the batting table:
#   monocle_matrix - numeric batting columns, transposed so that each
#                    column is one player and each row one statistic;
#   monocle_pData  - per-player annotation (batting columns joined with the
#                    bowling table), with `isBowler` defaulting to
#                    "Not bowler" for players absent from the bowling table.
monocle_batting = cleanedBattingData
#Do not run
monocle_matrix = monocle_batting %>%
  dplyr::select_if(is.numeric) %>%
  as.data.frame %>%
  as.matrix %>%
  t
# seq_len() rather than 1:n so that an empty table yields no names instead
# of the length-2 vector c(1, 0).
colnames(monocle_matrix) = seq_len(ncol(monocle_matrix))

monocle_pheno = monocle_batting %>%
  dplyr::left_join(cleanedBowlingData, by = "player") %>%
  dplyr::mutate(
    isBowler = coalesce(isBowler, "Not bowler")
  ) %>%
  as.data.frame

# The join is keyed on player name only; a name occurring more than once in
# the bowling table would duplicate rows and shift the annotation relative
# to the matrix columns. Fail early with a clear message if that happens.
stopifnot(nrow(monocle_pheno) == ncol(monocle_matrix))

monocle_pData = new("AnnotatedDataFrame", monocle_pheno)

# Sanity check: matrix column names must equal the annotation row names.
identical(colnames(monocle_matrix), rownames(monocle_pData))
## [1] TRUE
# Feature annotation: one row per statistic (matrix row), with the
# statistic's name used both as `gene_short_name` and as the row name.
monocle_fData_pre <- data.frame(
  gene_short_name = rownames(monocle_matrix),
  row.names = rownames(monocle_matrix)
)
monocle_fData <- new("AnnotatedDataFrame", data = monocle_fData_pre)
# Sanity check: feature annotation row names must equal the matrix row names.
identical(rownames(monocle_fData), rownames(monocle_matrix))
## [1] TRUE
# Assemble the CellDataSet (players as "cells", batting statistics as
# "genes"), then estimate size factors and dispersions.
# NOTE(review): negbinomial.size() is a count-data family, while the matrix
# also holds non-integer columns such as battingAvg -- confirm this is intended.
cricket <- newCellDataSet(
  monocle_matrix,
  phenoData = monocle_pData,
  featureData = monocle_fData,
  expressionFamily = negbinomial.size()
)

cricket <- cricket %>%
  estimateSizeFactors() %>%
  estimateDispersions()
## Warning in log(ifelse(y == 0, 1, y/mu)): NaNs produced
## Warning: step size truncated due to divergence
## Removing 2 outliers
# Test each statistic for association with bowler status, keep those with
# q-value below 0.01 as ordering features, and plot them.
diff_test_res <- differentialGeneTest(
  cricket,
  fullModelFormulaStr = "~isBowler"
)
# which() discards NA q-values, matching what subset() does.
ordering_genes <- row.names(diff_test_res)[
  which(diff_test_res[["qval"]] < 0.01)
]
cricket <- setOrderingFilter(cricket, ordering_genes)
plot_ordering_genes(cricket)
## Warning: Transformation introduced infinite values in continuous y-axis

# Reduce to two components with DDRTree, order the players along the
# learned trajectory, and draw it coloured by bowler status (first as a
# static plot, then as an interactive plotly widget).
cricket <- cricket %>%
  reduceDimension(max_components = 2, method = 'DDRTree') %>%
  orderCells()

tmp <- plot_cell_trajectory(cricket, color_by = "isBowler")
  # geom_text(aes(label = player), size = 1)

tmp

plotly::ggplotly(tmp)

K-means clustering

# Two-cluster k-means on the scaled batting features.
# kmeans() starts from randomly chosen centres, so without a seed both the
# partition and the cluster numbering (1 vs 2) can change between runs.
# The seed makes the result reproducible; nstart = 25 keeps the best of 25
# random starts so the solution does not hinge on a single initialisation.
set.seed(2018)
kmeansObj = kmeans(x = numBattingMatrix, centers = 2, nstart = 25)
kmeansObj
## K-means clustering with 2 clusters of sizes 492, 128
## 
## Cluster means:
##   matchesPlayed inningsBatted    notOuts battingAvg hundredsScored
## 1    -0.3832445    -0.4030279 -0.1759804 -0.2665595     -0.3849074
## 2     1.4730960     1.5491385  0.6764246  1.0245879      1.4794876
##   scoresOfFiftyOrMore ducksScored x10000RunsScored x50BattingAvg   logRuns
## 1          -0.3850548  -0.1441265       -0.1462267    -0.2345727 -0.334925
## 2           1.4800545   0.5539863        0.5620590     0.9016389  1.287368
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 1 2 2 1 2
##  [36] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 1 2 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 1
## [246] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [281] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 1 1 1 1 1 1
## [316] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## [351] 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [386] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [421] 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
## [456] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [491] 1 1 1 1 2 2 2 2 2 2 2 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [526] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
## [561] 2 2 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1
## [596] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 1964.390 2040.033
##  (between_SS / total_SS =  35.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

PCA

# Principal component analysis of the (already scaled) batting features.
pcaObj <- stats::prcomp(numBattingMatrix)


library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:S4Vectors':
## 
##     space
## The following object is masked from 'package:stats':
## 
##     lowess
# Venn diagram of the player names present in the numeric batting data
# versus the cleaned bowling data.
gplots::venn(
  data = list(
    battingPlayers = numBattingData$player,
    bowlingPlayers = cleanedBowlingData$player
  )
)

# One row per player: first two principal-component scores, the k-means
# cluster, and the bowling columns joined on by player name. Players that
# are absent from the bowling table are labelled "Not bowler".
pcaDataFrame = tibble(
  pca1 = pcaObj$x[,1],
  pca2 = pcaObj$x[,2],
  player = numBattingData$player,
  kmeans = as.factor(kmeansObj$cluster)
) %>% 
  dplyr::left_join(cleanedBowlingData, by = "player") %>% 
  dplyr::mutate(
    isBowler = coalesce(isBowler, "Not bowler")
  )

# Later steps pair pcaDataFrame$isBowler with the rows of numBattingMatrix
# by position. The join is keyed on player name only, so a name occurring
# more than once in the bowling table would add rows and silently shift the
# labels; stop here instead of producing misaligned results.
stopifnot(nrow(pcaDataFrame) == nrow(numBattingMatrix))

# Cross-tabulate bowler status (rows) against k-means cluster (columns).
table(pcaDataFrame$isBowler, 
      pcaDataFrame$kmeans)
##             
##                1   2
##   bowler     198  21
##   Not bowler 294 107
# Scatter plot of the first two principal components: colour shows bowler
# status, point shape shows the k-means cluster. `label` is not drawn by
# geom_point(); it is only carried along in the aesthetics.
p1 <- ggplot(
  data = pcaDataFrame,
  mapping = aes(
    x = pca1,
    y = pca2,
    colour = isBowler,
    shape = kmeans,
    label = player
  )
) +
  geom_point()


library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Render the PCA scatter plot as an interactive plotly widget.
plotly::ggplotly(p1)

Supervised learning

# DMresults <- ClassifyR::runTests(numBattingMatrix, 
#                       classes = factor(pcaDataFrame$isBowler), 
#                       datasetName = "Batting",
#                       classificationName = "Different Means", 
#                       permutations = 20, folds = 5,
#                       seed = 2018, verbose = 1)
# DMresults

library(SmokyScotch)
## Warning: replacing previous import 'magrittr::set_names' by
## 'purrr::set_names' when loading 'SmokyScotch'
## Warning: replacing previous import 'ggplot2::margin' by
## 'randomForest::margin' when loading 'SmokyScotch'
## Warning: replacing previous import 'dplyr::combine' by
## 'randomForest::combine' when loading 'SmokyScotch'
# Supervised learning: predict bowler status from the scaled batting
# features with three SmokyScotch cross-validation wrappers (SVM, logistic
# regression, random forest) and compare them.
# NOTE(review): the meaning of nFolds / nExp / cores and of the result
# fields extracted below is defined by SmokyScotch -- confirm against that
# package.

# Response, defined once. The levels are fixed explicitly because factor()
# otherwise sorts them with the current locale's collation, and the order of
# "bowler" vs "Not bowler" differs between locales (e.g. C vs en_US). With
# fixed levels the 0/1 coding below always means 0 = "bowler",
# 1 = "Not bowler".
isBowlerFactor <- factor(
  pcaDataFrame$isBowler,
  levels = c("bowler", "Not bowler")
)
stopifnot(!anyNA(isBowlerFactor))
isBowlerInt <- as.integer(isBowlerFactor) - 1L

# Stack one field of every cross-validation result into a character matrix
# with one row per result.
stackPredictions <- function(cvResults, field) {
  cvResults %>%
    purrr::map(field) %>%
    purrr::map(as.character) %>%
    do.call(rbind, .)
}

svmMultiResult <- svmCV_multi(
  x = numBattingMatrix,
  y = isBowlerFactor,
  nFolds = 5, nExp = 100,
  cores = 5)

logitMultiResult <- logitCV_multi(
  x = data.frame(numBattingMatrix),
  y = isBowlerInt,
  nFolds = 5, nExp = 100,
  cores = 5)

rfMultiResult <- rfCV_multi(
  x = data.frame(numBattingMatrix),
  y = isBowlerFactor,
  nFolds = 5, nExp = 100,
  cores = 5)

# Mean error of each result, per classifier.
svmMultiError <- svmMultiResult %>% purrr::map_dbl("svmMeanError")
logitMultiError <- logitMultiResult %>% purrr::map_dbl("logitMeanError")
rfMultiError <- rfMultiResult %>% purrr::map_dbl("rfMeanError")

boxplot(
  data.frame(svmMultiError,
             logitMultiError,
             rfMultiError)
)

predictMatrix1 <- stackPredictions(svmMultiResult, "svmPredictOrderedVector")
predictMatrix2 <- stackPredictions(logitMultiResult,
                                   "logitPredictIntOrderedVector")
predictMatrix3 <- stackPredictions(rfMultiResult, "rfPredictOrderedVector")

# Per-player scores for each classifier, one row per classifier. The logit
# predictions are integer-coded, so they are scored against the integer
# response; the other two are scored against the factor.
classifierMatrix <- rbind(
  binaryClassScores(y = isBowlerFactor,
                    predictMatrix = predictMatrix1),
  binaryClassScores(y = isBowlerInt,
                    predictMatrix = predictMatrix2),
  binaryClassScores(y = isBowlerFactor,
                    predictMatrix = predictMatrix3)
)
rownames(classifierMatrix) <- c("SVM", "Logit", "RF")

compareBinaryClassResults(
  y = isBowlerFactor,
  classifierMatrix)

# Attach each player's SVM score to the PCA table and flag the players that
# appear in the all-rounder table.
svmData <- cbind(
  pcaDataFrame,
  svmScore = binaryClassScores(
    y = factor(pcaDataFrame$isBowler),
    predictMatrix = predictMatrix1
  )
) %>%
  dplyr::mutate(isAllRounder = player %in% cleanedAllRounderData$player)



# Mean SVM score for all-rounders versus everyone else.
dplyr::summarise(
  dplyr::group_by(svmData, isAllRounder),
  meanSvmScore = mean(svmScore)
)
## # A tibble: 2 x 2
##   isAllRounder meanSvmScore
##   <lgl>               <dbl>
## 1 FALSE               0.870
## 2 TRUE                0.121
# Distribution of SVM scores for all-rounders versus everyone else.
ggplot(
  data = svmData,
  mapping = aes(x = isAllRounder, y = svmScore)
) +
  geom_boxplot()